data_df=read.csv("data/cdi.csv") %>%
mutate(CRM_1000=crimes/pop*1000) %>% ## add new variable CRM_1000(the crime rate per 1,000 population)
select(-crimes,-pop) %>% ## Since new variable CRM_1000=crimes/pop*1000, we do not consider these two variables(crimes and pop) any more
select(-id) %>%
mutate(region=as.factor(region))
data_df %>% skimr::skim_without_charts()
| Name | Piped data |
| Number of rows | 440 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| factor | 1 |
| numeric | 12 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| cty | 0 | 1 | 3 | 8 | 0 | 371 | 0 |
| state | 0 | 1 | 2 | 2 | 0 | 48 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| region | 0 | 1 | FALSE | 4 | 3: 152, 2: 108, 1: 103, 4: 77 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| area | 0 | 1 | 1041.41 | 1549.92 | 15.0 | 451.25 | 656.50 | 946.75 | 20062.00 |
| pop18 | 0 | 1 | 28.57 | 4.19 | 16.4 | 26.20 | 28.10 | 30.02 | 49.70 |
| pop65 | 0 | 1 | 12.17 | 3.99 | 3.0 | 9.88 | 11.75 | 13.62 | 33.80 |
| docs | 0 | 1 | 988.00 | 1789.75 | 39.0 | 182.75 | 401.00 | 1036.00 | 23677.00 |
| beds | 0 | 1 | 1458.63 | 2289.13 | 92.0 | 390.75 | 755.00 | 1575.75 | 27700.00 |
| hsgrad | 0 | 1 | 77.56 | 7.02 | 46.6 | 73.88 | 77.70 | 82.40 | 92.90 |
| bagrad | 0 | 1 | 21.08 | 7.65 | 8.1 | 15.28 | 19.70 | 25.33 | 52.30 |
| poverty | 0 | 1 | 8.72 | 4.66 | 1.4 | 5.30 | 7.90 | 10.90 | 36.30 |
| unemp | 0 | 1 | 6.60 | 2.34 | 2.2 | 5.10 | 6.20 | 7.50 | 21.30 |
| pcincome | 0 | 1 | 18561.48 | 4059.19 | 8899.0 | 16118.25 | 17759.00 | 20270.00 | 37541.00 |
| totalinc | 0 | 1 | 7869.27 | 12884.32 | 1141.0 | 2311.00 | 3857.00 | 8654.25 | 184230.00 |
| CRM_1000 | 0 | 1 | 57.29 | 27.33 | 4.6 | 38.10 | 52.43 | 72.60 | 295.99 |
boxplot for each continuous variable
par(mfrow=c(3,4))
boxplot(data_df$area,main='Land area measured in square miles')
boxplot(data_df$pop18,main='Percent of population aged 18-34')
boxplot(data_df$pop65,main='Percent of populationß aged 65+')
boxplot(data_df$docs,main='Number of active physicians')
boxplot(data_df$beds,main='Number of hospital beds')
boxplot(data_df$hsgrad,main='Percent high school graduates')
boxplot(data_df$bagrad,main='Percent bachelor’s degrees')
boxplot(data_df$unemp,main='Percent below poverty level')
boxplot(data_df$pcincome,main='Per capita income')
boxplot(data_df$totalinc,main='Total personal income')
boxplot(data_df$CRM_1000,main='the crime rate per 1,000 population')
pair_df=
data_df %>%
select(-cty,-state)
pairs(pair_df)
## Correlation plot
cor_df=
data_df %>%
select(-cty,-state,-region) %>%
cor() ## Correlation coefficient
corrplot(cor(cor_df), type = "upper", diag = FALSE)
相关系数:|r|<0.4为低度线性相关;0.4≤|r|<0.7为显著性相关;0.7≤|r|<1为高度线性相关
显著相关: pop18 vs bagrad 0.45609703
pop18 vs pop65 -0.616309639
hsgrad vs poverty -0.691750483 hsgrad vs unemp -0.593595788 hsgrad vs pcincome 0.52299613 bagrad vs unemp -0.540906913
bagrad vs pcincome 0.69536186 poverty vs bagrad -0.40842385 unemp vs poverty 0.436947236 pcincome vs poverty -0.60172504 CRM_100 VS poverty 0.471844218
高度线性相关: bed vs docs 0.950464395 hsgrad vs bagrad 0.70778672 totalinc vs docs 0.948110571
totalinc vs beds 0.902061545
data_df %>%
ggplot(aes(beds,docs)) + geom_point()
data_df %>%
ggplot(aes(totalinc,docs)) + geom_point()
qqnorm(data_df$CRM_1000) #非常接近线性
qqnorm(data_df$area)
qqnorm(data_df$pop18) #接近线性
qqnorm(data_df$pop65)
qqnorm(data_df$docs)
qqnorm(data_df$beds)
qqnorm(data_df$hsgrad) #接近线性
qqnorm(data_df$bagrad)
qqnorm(data_df$poverty)
qqnorm(data_df$unemp)
qqnorm(data_df$pcincome) #接近线性
qqnorm(data_df$totalinc)
qqnorm(data_df$totalinc)
## establish a simple linear regression of CRM_1000 VS poverty
slr1 = lm(CRM_1000~poverty,data=data_df)
summary(slr1)
##
## Call:
## lm(formula = CRM_1000 ~ poverty, data = data_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.008 -14.578 -2.561 13.605 208.853
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.1390 2.4435 13.56 <2e-16 ***
## poverty 2.7690 0.2472 11.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.12 on 438 degrees of freedom
## Multiple R-squared: 0.2226, Adjusted R-squared: 0.2209
## F-statistic: 125.4 on 1 and 438 DF, p-value: < 2.2e-16
anova(slr1)
## Analysis of Variance Table
##
## Response: CRM_1000
## Df Sum Sq Mean Sq F value Pr(>F)
## poverty 1 72991 72991 125.44 < 2.2e-16 ***
## Residuals 438 254856 582
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(slr1)
data_df=data_df %>%
select(-cty,-state)
mult.fit = lm(CRM_1000 ~ ., data = data_df)
summary(mult.fit)
##
## Call:
## lm(formula = CRM_1000 ~ ., data = data_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.880 -10.126 -1.627 9.724 193.039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.043e+01 2.973e+01 -2.705 0.007103 **
## area -5.519e-04 7.300e-04 -0.756 0.450046
## pop18 1.368e+00 3.527e-01 3.879 0.000121 ***
## pop65 1.884e-01 3.171e-01 0.594 0.552716
## docs -7.066e-03 2.498e-03 -2.829 0.004896 **
## beds 1.174e-02 1.567e-03 7.490 4.03e-13 ***
## hsgrad 2.076e-01 2.930e-01 0.708 0.479090
## bagrad -3.362e-01 3.191e-01 -1.054 0.292702
## poverty 2.372e+00 4.009e-01 5.915 6.83e-09 ***
## unemp 2.234e-01 5.650e-01 0.395 0.692742
## pcincome 2.487e-03 5.021e-04 4.954 1.05e-06 ***
## totalinc -6.745e-04 2.629e-04 -2.566 0.010632 *
## region2 8.505e+00 2.957e+00 2.877 0.004221 **
## region3 2.493e+01 2.895e+00 8.610 < 2e-16 ***
## region4 2.305e+01 3.663e+00 6.294 7.72e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.43 on 425 degrees of freedom
## Multiple R-squared: 0.5107, Adjusted R-squared: 0.4946
## F-statistic: 31.68 on 14 and 425 DF, p-value: < 2.2e-16
broom::tidy(mult.fit)
## # A tibble: 15 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -80.4 29.7 -2.71 7.10e- 3
## 2 area -0.000552 0.000730 -0.756 4.50e- 1
## 3 pop18 1.37 0.353 3.88 1.21e- 4
## 4 pop65 0.188 0.317 0.594 5.53e- 1
## 5 docs -0.00707 0.00250 -2.83 4.90e- 3
## 6 beds 0.0117 0.00157 7.49 4.03e-13
## 7 hsgrad 0.208 0.293 0.708 4.79e- 1
## 8 bagrad -0.336 0.319 -1.05 2.93e- 1
## 9 poverty 2.37 0.401 5.92 6.83e- 9
## 10 unemp 0.223 0.565 0.395 6.93e- 1
## 11 pcincome 0.00249 0.000502 4.95 1.05e- 6
## 12 totalinc -0.000675 0.000263 -2.57 1.06e- 2
## 13 region2 8.51 2.96 2.88 4.22e- 3
## 14 region3 24.9 2.89 8.61 1.44e-16
## 15 region4 23.1 3.66 6.29 7.72e-10
qqPlot(mult.fit,id.method='identify',simulate = TRUE,main='Q-Q plot')
<<<<<<< HEAD
## [1] 6 282
可以看到所有的点都在直线附近,并都落在置信区间内,这表明正态性假设符合得很完美
进行D-W检验:
durbinWatsonTest(mult.fit)
## lag Autocorrelation D-W Statistic p-value
<<<<<<< HEAD
## 1 0.06792391 1.857479 0.12
=======
## 1 0.06792391 1.857479 0.126
>>>>>>> 29c7401617081becc00fada919dfce6a190c81cc
## Alternative hypothesis: rho != 0
P = 0.124 > 0.05不显著,说明因变量之间无自相关性,互相独立。
crPlots(mult.fit)
成分残差图证实了线性假设,说明线性模型对该数据集是比较合适的。
mult.fit = lm(CRM_1000 ~ ., data = data_df)
ncvTest(mult.fit)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 173.5754, Df = 1, p = < 2.22e-16
p = 2.037e-09 显著,说明误差方差不恒定。选用almost所有predictor的线性回归不满足同方差性 异方差性的出现意味着误差项的方差不恒定,这常常出现在有异常值(Outlier)的数据集上,如果使用标准的回归模型,这些异常值的重要性往往被高估。在这种情况下,标准差和置信区间不一定会变大还是变小。